library(dplyr)
library(ggplot2)
library(tidyr)
library(ggplot2)
library(reshape2)
library(data.table)
library(broom)
library(olsrr)
library(corrplot)
library(plotly)
Import data
setwd("D:/Project")
air <- read.csv("Air.csv", sep=";", dec=",")
air <- air[,-c(16, 17)]
Make it long and check data. We can see outliers in -200 - replace it with NA. We can see that one of columns - Non Metanic HydroCarbons concentration have a lot of NAs. Let’s drop it and then drop NAs. We saved data loosing one column. Also we can notice that not all columns have normal distribution
str(air)
## 'data.frame': 9471 obs. of 15 variables:
## $ Date : Factor w/ 392 levels "","01/01/2005",..: 116 116 116 116 116 116 129 129 129 129 ...
## $ Time : Factor w/ 25 levels "","00.00.00",..: 20 21 22 23 24 25 2 3 4 5 ...
## $ CO.GT. : num 2.6 2 2.2 2.2 1.6 1.2 1.2 1 0.9 0.6 ...
## $ PT08.S1.CO. : int 1360 1292 1402 1376 1272 1197 1185 1136 1094 1010 ...
## $ NMHC.GT. : int 150 112 88 80 51 38 31 31 24 19 ...
## $ C6H6.GT. : num 11.9 9.4 9 9.2 6.5 4.7 3.6 3.3 2.3 1.7 ...
## $ PT08.S2.NMHC.: int 1046 955 939 948 836 750 690 672 609 561 ...
## $ NOx.GT. : int 166 103 131 172 131 89 62 62 45 -200 ...
## $ PT08.S3.NOx. : int 1056 1174 1140 1092 1205 1337 1462 1453 1579 1705 ...
## $ NO2.GT. : int 113 92 114 122 116 96 77 76 60 -200 ...
## $ PT08.S4.NO2. : int 1692 1559 1555 1584 1490 1393 1333 1333 1276 1235 ...
## $ PT08.S5.O3. : int 1268 972 1074 1203 1110 949 733 730 620 501 ...
## $ T : num 13.6 13.3 11.9 11 11.2 11.2 11.3 10.7 10.7 10.3 ...
## $ RH : num 48.9 47.7 54 60 59.6 59.2 56.8 60 59.7 60.2 ...
## $ AH : num 0.758 0.726 0.75 0.787 0.789 ...
for (i in 3:ncol(air)) {
air[,i][which(air[,i] == -200)] <- NA
}
air_long <- melt(air)
## Warning in melt(air): The melt generic in data.table has been passed a
## data.frame and will attempt to redirect to the relevant reshape2 method;
## please note that reshape2 is deprecated, and this redirection is now
## deprecated as well. To continue using melt methods from reshape2 while both
## libraries are attached, e.g. melt.list, you can prepend the namespace like
## reshape2::melt(air). In the next version, this warning will become an error.
air_long <- air_long[,-c(1,2)]
ggplot(air_long, aes(value)) +
geom_histogram() +
facet_wrap(~variable, scales = "free")
## Warning: Removed 18183 rows containing non-finite values (stat_bin).
air <- air[,c(1:4,6:15)]
air <- drop_na(air)
air_long <- melt(air)
## Warning in melt(air): The melt generic in data.table has been passed a
## data.frame and will attempt to redirect to the relevant reshape2 method;
## please note that reshape2 is deprecated, and this redirection is now
## deprecated as well. To continue using melt methods from reshape2 while both
## libraries are attached, e.g. melt.list, you can prepend the namespace like
## reshape2::melt(air). In the next version, this warning will become an error.
air_long <- air_long[,-c(1,2)]
summary(air)
## Date Time CO.GT. PT08.S1.CO.
## 02/04/2005: 24 10.00.00: 312 Min. : 0.100 Min. : 647
## 03/04/2005: 24 20.00.00: 310 1st Qu.: 1.100 1st Qu.: 956
## 15/03/2005: 24 09.00.00: 309 Median : 1.900 Median :1085
## 16/03/2005: 24 12.00.00: 309 Mean : 2.182 Mean :1120
## 18/03/2005: 24 18.00.00: 309 3rd Qu.: 2.900 3rd Qu.:1254
## 19/03/2005: 24 21.00.00: 309 Max. :11.900 Max. :2040
## (Other) :6797 (Other) :5083
## C6H6.GT. PT08.S2.NMHC. NOx.GT. PT08.S3.NOx.
## Min. : 0.20 Min. : 390.0 Min. : 2.0 Min. : 322.0
## 1st Qu.: 4.90 1st Qu.: 760.0 1st Qu.: 103.0 1st Qu.: 642.0
## Median : 8.80 Median : 931.0 Median : 186.0 Median : 786.0
## Mean :10.55 Mean : 958.5 Mean : 250.7 Mean : 816.9
## 3rd Qu.:14.60 3rd Qu.:1135.0 3rd Qu.: 335.0 3rd Qu.: 947.0
## Max. :63.70 Max. :2214.0 Max. :1479.0 Max. :2683.0
##
## NO2.GT. PT08.S4.NO2. PT08.S5.O3. T RH
## Min. : 2.0 Min. : 551 Min. : 221 Min. :-1.90 Min. : 9.20
## 1st Qu.: 79.0 1st Qu.:1207 1st Qu.: 760 1st Qu.:11.20 1st Qu.:35.30
## Median :110.0 Median :1457 Median :1006 Median :16.80 Median :49.20
## Mean :113.9 Mean :1453 Mean :1058 Mean :17.76 Mean :48.88
## 3rd Qu.:142.0 3rd Qu.:1683 3rd Qu.:1322 3rd Qu.:23.70 3rd Qu.:62.20
## Max. :333.0 Max. :2775 Max. :2523 Max. :44.60 Max. :88.70
##
## AH
## Min. :0.1847
## 1st Qu.:0.6941
## Median :0.9539
## Mean :0.9856
## 3rd Qu.:1.2516
## Max. :2.1806
##
So, we need to normolize our data.
normalize <- function(x) {
return ((x - min(x)) / (max(x) - min(x)))
}
air_norm <- as.data.frame(lapply(air[3:14], normalize))
air_long <- melt(air_norm)
## Warning in melt(air_norm): The melt generic in data.table has been passed
## a data.frame and will attempt to redirect to the relevant reshape2 method;
## please note that reshape2 is deprecated, and this redirection is now
## deprecated as well. To continue using melt methods from reshape2 while both
## libraries are attached, e.g. melt.list, you can prepend the namespace like
## reshape2::melt(air_norm). In the next version, this warning will become an
## error.
## No id variables; using all as measure variables
Check our data. Quantile - the proportion of cases that are less than certain values. If the requirements of “normality” are here - the data should lie on a diagonal line. We can see that several columns do not have normal distribution.
#CO.GT
qqnorm(air_norm$CO.GT., pch = 1, frame = FALSE)
qqline(air_norm$CO.GT., col = "steelblue", lwd = 2)
#PT08.S1.CO
qqnorm(air_norm$PT08.S1.CO., pch = 1, frame = FALSE)
qqline(air_norm$PT08.S1.CO., col = "steelblue", lwd = 2)
#C6H6.GT
qqnorm(air_norm$C6H6.GT., pch = 1, frame = FALSE)
qqline(air_norm$C6H6.GT., col = "steelblue", lwd = 2)
#PT08.S2.NMHC
qqnorm(air_norm$PT08.S2.NMHC., pch = 1, frame = FALSE)
qqline(air_norm$PT08.S2.NMHC., col = "steelblue", lwd = 2)
#NOx.GT
qqnorm(air_norm$NOx.GT., pch = 1, frame = FALSE)
qqline(air_norm$NOx.GT., col = "steelblue", lwd = 2)
#PT08.S3.NOx
qqnorm(air_norm$PT08.S3.NOx., pch = 1, frame = FALSE)
qqline(air_norm$PT08.S3.NOx., col = "steelblue", lwd = 2)
#NO2.GT
qqnorm(air_norm$NO2.GT., pch = 1, frame = FALSE)
qqline(air_norm$NO2.GT., col = "steelblue", lwd = 2)
#PT08.S4.NO2
qqnorm(air_norm$PT08.S4.NO2., pch = 1, frame = FALSE)
qqline(air_norm$PT08.S4.NO2., col = "steelblue", lwd = 2)
#PT08.S5.O3
qqnorm(air_norm$PT08.S5.O3., pch = 1, frame = FALSE)
qqline(air_norm$PT08.S5.O3., col = "steelblue", lwd = 2)
#T
qqnorm(air_norm$T, pch = 1, frame = FALSE)
qqline(air_norm$T, col = "steelblue", lwd = 2)
#RH
qqnorm(air_norm$RH, pch = 1, frame = FALSE)
qqline(air_norm$RH, col = "steelblue", lwd = 2)
#AH
qqnorm(air_norm$AH, pch = 1, frame = FALSE)
qqline(air_norm$AH, col = "steelblue", lwd = 2)
Response C6H6.GT. We can see columns that have linear response to C6H6.GT.
mod <- lm(data = air_norm, C6H6.GT. ~ CO.GT.)
summary(mod)
##
## Call:
## lm(formula = C6H6.GT. ~ CO.GT., data = air_norm)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.23935 -0.02496 -0.00249 0.02357 0.76733
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.0050753 0.0009115 5.568 2.67e-08 ***
## CO.GT. 0.8952134 0.0042471 210.782 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.04321 on 6939 degrees of freedom
## Multiple R-squared: 0.8649, Adjusted R-squared: 0.8649
## F-statistic: 4.443e+04 on 1 and 6939 DF, p-value: < 2.2e-16
augment(mod)%>%
ggplot(aes(x = CO.GT., y = C6H6.GT.))+
geom_point()+
geom_line(aes(y= .fitted), color = "blue", size = 1)
residuals(mod) %>% hist(main = "Residuals CO.GT.")
plot(mod, which = 1)
mod1 <- lm(data = air_norm, C6H6.GT. ~ PT08.S1.CO.)
summary(mod1)
##
## Call:
## lm(formula = C6H6.GT. ~ PT08.S1.CO., data = air_norm)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.18826 -0.03559 -0.00288 0.03079 0.70768
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.059959 0.001613 -37.18 <2e-16 ***
## PT08.S1.CO. 0.656927 0.004312 152.37 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.0564 on 6939 degrees of freedom
## Multiple R-squared: 0.7699, Adjusted R-squared: 0.7699
## F-statistic: 2.322e+04 on 1 and 6939 DF, p-value: < 2.2e-16
augment(mod1)%>%
ggplot(aes(x = PT08.S1.CO., y = C6H6.GT.))+
geom_point()+
geom_line(aes(y= .fitted), color = "blue", size = 1)
residuals(mod1) %>% hist(main = "Residuals PT08.S1.CO.")
plot(mod1, which = 1)
mod2 <- lm(data = air_norm, C6H6.GT. ~ PT08.S2.NMHC.)
summary(mod2)
##
## Call:
## lm(formula = C6H6.GT. ~ PT08.S2.NMHC., data = air_norm)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.018385 -0.015140 -0.007871 0.007973 0.287650
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.0856861 0.0006204 -138.1 <2e-16 ***
## PT08.S2.NMHC. 0.7980363 0.0018053 442.1 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.02177 on 6939 degrees of freedom
## Multiple R-squared: 0.9657, Adjusted R-squared: 0.9657
## F-statistic: 1.954e+05 on 1 and 6939 DF, p-value: < 2.2e-16
augment(mod2)%>%
ggplot(aes(x = PT08.S2.NMHC., y = C6H6.GT.))+
geom_point()+
geom_line(aes(y= .fitted), color = "blue", size = 1)
residuals(mod2) %>% hist(main = "Residuals PT08.S2.NMHC.")
plot(mod2, which = 1)
mod3 <- lm(data = air_norm, C6H6.GT. ~ NOx.GT.)
summary(mod3)
##
## Call:
## lm(formula = C6H6.GT. ~ NOx.GT., data = air_norm)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.20002 -0.06256 -0.01753 0.04394 0.61415
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.062395 0.001528 40.84 <2e-16 ***
## NOx.GT. 0.597921 0.006951 86.01 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.08179 on 6939 degrees of freedom
## Multiple R-squared: 0.516, Adjusted R-squared: 0.5159
## F-statistic: 7398 on 1 and 6939 DF, p-value: < 2.2e-16
augment(mod3)%>%
ggplot(aes(x = NOx.GT., y = C6H6.GT.))+
geom_point()+
geom_line(aes(y= .fitted), color = "blue", size = 1)
residuals(mod3) %>% hist(main = "Residuals NOx.GT.")
plot(mod3, which = 1)
mod4 <- lm(data = air_norm, C6H6.GT. ~ PT08.S3.NOx.)
summary(mod4)
##
## Call:
## lm(formula = C6H6.GT. ~ PT08.S3.NOx., data = air_norm)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.14285 -0.05823 -0.01275 0.03632 0.69946
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.330684 0.002140 154.49 <2e-16 ***
## PT08.S3.NOx. -0.799672 0.009101 -87.87 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.08089 on 6939 degrees of freedom
## Multiple R-squared: 0.5267, Adjusted R-squared: 0.5266
## F-statistic: 7721 on 1 and 6939 DF, p-value: < 2.2e-16
augment(mod4)%>%
ggplot(aes(x = PT08.S3.NOx., y = C6H6.GT.))+
geom_point()+
geom_line(aes(y= .fitted), color = "blue", size = 1)
residuals(mod4) %>% hist(main = "Residuals PT08.S3.NOx.")
plot(mod4, which = 1)
mod5 <- lm(data = air_norm, C6H6.GT. ~ NO2.GT.)
summary(mod5)
##
## Call:
## lm(formula = C6H6.GT. ~ NO2.GT., data = air_norm)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.28123 -0.05625 -0.00966 0.04071 0.68139
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.004056 0.002881 -1.408 0.159
## NO2.GT. 0.494451 0.007848 63.005 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.09377 on 6939 degrees of freedom
## Multiple R-squared: 0.3639, Adjusted R-squared: 0.3638
## F-statistic: 3970 on 1 and 6939 DF, p-value: < 2.2e-16
augment(mod5)%>%
ggplot(aes(x = NO2.GT., y = C6H6.GT.))+
geom_point()+
geom_line(aes(y= .fitted), color = "blue", size = 1)
residuals(mod5) %>% hist(main = "Residuals NO2.GT.")
plot(mod5, which = 1)
mod6 <- lm(data = air_norm, C6H6.GT. ~ PT08.S4.NO2.)
summary(mod6)
##
## Call:
## lm(formula = C6H6.GT. ~ PT08.S4.NO2., data = air_norm)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.15656 -0.05635 -0.00724 0.04597 0.79223
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.065500 0.002506 -26.14 <2e-16 ***
## PT08.S4.NO2. 0.563771 0.005755 97.96 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.07616 on 6939 degrees of freedom
## Multiple R-squared: 0.5803, Adjusted R-squared: 0.5803
## F-statistic: 9596 on 1 and 6939 DF, p-value: < 2.2e-16
augment(mod6)%>%
ggplot(aes(x = PT08.S4.NO2., y = C6H6.GT.))+
geom_point()+
geom_line(aes(y= .fitted), color = "blue", size = 1)
residuals(mod6) %>% hist(main = "Residuals PT08.S4.NO2.")
plot(mod6, which = 1)
mod7 <- lm(data = air_norm,C6H6.GT. ~ PT08.S5.O3.)
summary(mod7)
##
## Call:
## lm(formula = C6H6.GT. ~ PT08.S5.O3., data = air_norm)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.24981 -0.03674 -0.00059 0.03326 0.52482
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.045328 0.001642 -27.61 <2e-16 ***
## PT08.S5.O3. 0.573303 0.004063 141.11 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.05977 on 6939 degrees of freedom
## Multiple R-squared: 0.7416, Adjusted R-squared: 0.7415
## F-statistic: 1.991e+04 on 1 and 6939 DF, p-value: < 2.2e-16
augment(mod7)%>%
ggplot(aes(x = PT08.S5.O3., y = C6H6.GT.))+
geom_point()+
geom_line(aes(y= .fitted), color = "blue", size = 1)
residuals(mod7) %>% hist(main = "Residuals PT08.S5.O3.")
plot(mod7, which = 1)
mod8 <- lm(data = air_norm,C6H6.GT. ~ `T`)
summary(mod8)
##
## Call:
## lm(formula = C6H6.GT. ~ T, data = air_norm)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.17626 -0.08503 -0.02819 0.05791 0.86597
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.113686 0.003377 33.66 <2e-16 ***
## T 0.116814 0.007286 16.03 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1155 on 6939 degrees of freedom
## Multiple R-squared: 0.03572, Adjusted R-squared: 0.03558
## F-statistic: 257.1 on 1 and 6939 DF, p-value: < 2.2e-16
augment(mod8)%>%
ggplot(aes(x = `T`, y = C6H6.GT.))+
geom_point()+
geom_line(aes(y= .fitted), color = "blue", size = 1)
residuals(mod8) %>% hist(main = "Residuals T")
plot(mod8, which = 1)
mod9 <- lm(data = air_norm, C6H6.GT. ~ RH)
summary(mod9)
##
## Call:
## lm(formula = C6H6.GT. ~ RH, data = air_norm)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.16472 -0.08857 -0.02776 0.06256 0.83736
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.168841 0.003508 48.131 <2e-16 ***
## RH -0.011576 0.006434 -1.799 0.0721 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1175 on 6939 degrees of freedom
## Multiple R-squared: 0.0004662, Adjusted R-squared: 0.0003221
## F-statistic: 3.236 on 1 and 6939 DF, p-value: 0.07206
augment(mod9)%>%
ggplot(aes(x = RH, y = C6H6.GT.))+
geom_point()+
geom_line(aes(y= .fitted), color = "blue", size = 1)
residuals(mod9) %>% hist(main = "Residuals RH")
plot(mod9, which = 1)
mod10 <- lm(data = air_norm, C6H6.GT. ~ AH)
summary(mod10)
##
## Call:
## lm(formula = C6H6.GT. ~ AH, data = air_norm)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.19607 -0.08744 -0.02681 0.06081 0.86382
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.119150 0.003096 38.49 <2e-16 ***
## AH 0.109438 0.006899 15.86 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1155 on 6939 degrees of freedom
## Multiple R-squared: 0.035, Adjusted R-squared: 0.03486
## F-statistic: 251.6 on 1 and 6939 DF, p-value: < 2.2e-16
augment(mod10)%>%
ggplot(aes(x = AH, y = C6H6.GT.))+
geom_point()+
geom_line(aes(y= .fitted), color = "blue", size = 1)
residuals(mod10) %>% hist(main = "Residuals AH")
plot(mod10, which = 1)
pairs(air_norm[,sapply(air_norm, is.double)])
Check correlation.
corrplot.mixed(cor(air_norm, method = "kendall"), number.cex = .7)
Check multicolinearity. I decided to see what’s going on by hands. For multic. we need to have CI>30 and VIF>10. Also, we can see it when coeficients of columns alone are not significant but together they give significant p-value.
mod <- lm(C6H6.GT.~PT08.S2.NMHC. + NOx.GT. + PT08.S3.NOx. + NO2.GT. + PT08.S4.NO2., data = air_norm)
summary(mod)
##
## Call:
## lm(formula = C6H6.GT. ~ PT08.S2.NMHC. + NOx.GT. + PT08.S3.NOx. +
## NO2.GT. + PT08.S4.NO2., data = air_norm)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.107541 -0.011598 -0.003860 0.007867 0.273775
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.128346 0.001736 -73.928 < 2e-16 ***
## PT08.S2.NMHC. 0.811767 0.005821 139.463 < 2e-16 ***
## NOx.GT. 0.104145 0.003158 32.974 < 2e-16 ***
## PT08.S3.NOx. 0.130556 0.003545 36.827 < 2e-16 ***
## NO2.GT. -0.042904 0.002939 -14.597 < 2e-16 ***
## PT08.S4.NO2. 0.019687 0.003667 5.369 8.19e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.01863 on 6935 degrees of freedom
## Multiple R-squared: 0.9749, Adjusted R-squared: 0.9749
## F-statistic: 5.389e+04 on 5 and 6935 DF, p-value: < 2.2e-16
X <- model.matrix(~PT08.S2.NMHC. + NOx.GT. + PT08.S3.NOx. + NO2.GT. + PT08.S4.NO2., data = air_norm)
XX <- t(X) %*% X
eigen <- eigen(XX)
CI <- sqrt(max(eigen$values) / min(eigen$values))
CI
## [1] 38.71032
# Also can count it via formula 1/1-R^2
ols_coll_diag(mod)
## Tolerance and Variance Inflation Factor
## ---------------------------------------
## Variables Tolerance VIF
## 1 PT08.S2.NMHC. 0.07043174 14.198143
## 2 NOx.GT. 0.25131144 3.979126
## 3 PT08.S3.NOx. 0.34956090 2.860732
## 4 NO2.GT. 0.28139450 3.553730
## 5 PT08.S4.NO2. 0.14737223 6.785539
##
##
## Eigenvalue and Condition Index
## ------------------------------
## Eigenvalue Condition Index intercept PT08.S2.NMHC. NOx.GT.
## 1 5.139259035 1.000000 0.0005983670 4.445038e-04 0.002970002
## 2 0.598258522 2.930932 0.0017127027 1.080741e-03 0.053610656
## 3 0.182088746 5.312619 0.0001264032 9.057705e-03 0.130894451
## 4 0.059729372 9.275905 0.0029993206 8.942371e-06 0.506597242
## 5 0.013526138 19.492311 0.8122319948 1.333775e-01 0.028926663
## 6 0.007138187 26.832201 0.1823312118 8.560306e-01 0.277000986
## PT08.S3.NOx. NO2.GT. PT08.S4.NO2.
## 1 0.001760862 0.001513613 6.783985e-04
## 2 0.065813839 0.002081996 3.339913e-05
## 3 0.042990083 0.018947690 4.823296e-02
## 4 0.079221335 0.432104143 1.009362e-02
## 5 0.806637576 0.118860131 6.737641e-04
## 6 0.003576305 0.426492427 9.402879e-01
residuals(mod) %>% hist(main = "Residuals PT08.S2.NMHC. + NOx.GT. + PT08.S3.NOx. + NO2.GT. + PT08.S4.NO2.")
plot(mod, which = 1)
mod <- lm(C6H6.GT.~NOx.GT. + PT08.S3.NOx. + NO2.GT.+PT08.S4.NO2., data = air_norm)
summary(mod)
##
## Call:
## lm(formula = C6H6.GT. ~ NOx.GT. + PT08.S3.NOx. + NO2.GT. + PT08.S4.NO2.,
## data = air_norm)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.13920 -0.02335 -0.00319 0.01927 0.61839
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.143806 0.003379 -42.557 <2e-16 ***
## NOx.GT. 0.356365 0.005050 70.564 <2e-16 ***
## PT08.S3.NOx. 0.008813 0.006702 1.315 0.189
## NO2.GT. 0.158388 0.004994 31.717 <2e-16 ***
## PT08.S4.NO2. 0.472327 0.003329 141.894 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.03634 on 6936 degrees of freedom
## Multiple R-squared: 0.9045, Adjusted R-squared: 0.9045
## F-statistic: 1.643e+04 on 4 and 6936 DF, p-value: < 2.2e-16
X <- model.matrix(~NOx.GT. + PT08.S3.NOx. + NO2.GT.+PT08.S4.NO2., data = air_norm)
XX <- t(X) %*% X
eigen <- eigen(XX)
CI <- sqrt(max(eigen$values) / min(eigen$values))
CI
## [1] 21.5376
# Also can count it via formula 1/1-R^2
ols_coll_diag(mod)
## Tolerance and Variance Inflation Factor
## ---------------------------------------
## Variables Tolerance VIF
## 1 NOx.GT. 0.3739089 2.674448
## 2 PT08.S3.NOx. 0.3721233 2.687282
## 3 NO2.GT. 0.3708137 2.696772
## 4 PT08.S4.NO2. 0.6803347 1.469865
##
##
## Eigenvalue and Condition Index
## ------------------------------
## Eigenvalue Condition Index intercept NOx.GT. PT08.S3.NOx. NO2.GT.
## 1 4.21432517 1.000000 9.087272e-04 0.006362944 0.002997369 0.002945636
## 2 0.56258305 2.736974 1.141258e-03 0.113454921 0.066527093 0.005957732
## 3 0.15130645 5.277586 8.538575e-05 0.123219339 0.081423501 0.012714553
## 4 0.05972556 8.400089 2.918017e-03 0.755643783 0.082724495 0.568361415
## 5 0.01205978 18.693662 9.949466e-01 0.001319013 0.766327542 0.410020664
## PT08.S4.NO2.
## 1 4.556101e-03
## 2 1.072926e-05
## 3 4.090600e-01
## 4 4.973504e-02
## 5 5.366381e-01
residuals(mod) %>% hist(main = "Residuals NOx.GT. + PT08.S3.NOx. + NO2.GT.+PT08.S4.NO2.")
plot(mod, which = 1)
mod <- lm(C6H6.GT.~NOx.GT. + PT08.S3.NOx.+PT08.S4.NO2., data = air_norm)
summary(mod)
##
## Call:
## lm(formula = C6H6.GT. ~ NOx.GT. + PT08.S3.NOx. + PT08.S4.NO2.,
## data = air_norm)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.15046 -0.02558 -0.00366 0.02252 0.62349
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.079501 0.002893 -27.48 <2e-16 ***
## NOx.GT. 0.446413 0.004469 99.89 <2e-16 ***
## PT08.S3.NOx. -0.067482 0.006693 -10.08 <2e-16 ***
## PT08.S4.NO2. 0.447808 0.003464 129.26 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.03888 on 6937 degrees of freedom
## Multiple R-squared: 0.8907, Adjusted R-squared: 0.8906
## F-statistic: 1.884e+04 on 3 and 6937 DF, p-value: < 2.2e-16
X <- model.matrix(~ NOx.GT. + PT08.S3.NOx.+PT08.S4.NO2., data = air_norm)
XX <- t(X) %*% X
eigen <- eigen(XX)
CI <- sqrt(max(eigen$values) / min(eigen$values))
CI
## [1] 19.31043
# Also can count it via formula 1/1-R^2
ols_coll_diag(mod)
## Tolerance and Variance Inflation Factor
## ---------------------------------------
## Variables Tolerance VIF
## 1 NOx.GT. 0.5466887 1.829194
## 2 PT08.S3.NOx. 0.4271601 2.341043
## 3 PT08.S4.NO2. 0.7191206 1.390587
##
##
## Eigenvalue and Condition Index
## ------------------------------
## Eigenvalue Condition Index intercept NOx.GT. PT08.S3.NOx. PT08.S4.NO2.
## 1 3.31230936 1.000000 2.323060e-03 0.01406454 0.005966555 0.007892873
## 2 0.52303786 2.516511 7.982151e-04 0.23453090 0.072741803 0.001422487
## 3 0.14668190 4.752008 5.012680e-06 0.33438471 0.105029622 0.396339407
## 4 0.01797088 13.576280 9.968737e-01 0.41701984 0.816262020 0.594345233
residuals(mod) %>% hist(main = "Residuals NOx.GT. + PT08.S3.NOx. + PT08.S4.NO2")
plot(mod, which = 1)
mod <- lm(C6H6.GT.~NOx.GT. +PT08.S4.NO2., data = air_norm)
summary(mod)
##
## Call:
## lm(formula = C6H6.GT. ~ NOx.GT. + PT08.S4.NO2., data = air_norm)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.16081 -0.02592 -0.00350 0.02360 0.62284
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.105500 0.001320 -79.91 <2e-16 ***
## NOx.GT. 0.475674 0.003423 138.97 <2e-16 ***
## PT08.S4.NO2. 0.464895 0.003043 152.76 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.03916 on 6938 degrees of freedom
## Multiple R-squared: 0.8891, Adjusted R-squared: 0.889
## F-statistic: 2.781e+04 on 2 and 6938 DF, p-value: < 2.2e-16
X <- model.matrix(~NOx.GT. +PT08.S4.NO2., data = air_norm)
XX <- t(X) %*% X
eigen <- eigen(XX)
CI <- sqrt(max(eigen$values) / min(eigen$values))
CI
## [1] 8.491566
# Also can count it via formula 1/1-R^2
ols_coll_diag(mod)
## Tolerance and Variance Inflation Factor
## ---------------------------------------
## Variables Tolerance VIF
## 1 NOx.GT. 0.945341 1.057819
## 2 PT08.S4.NO2. 0.945341 1.057819
##
##
## Eigenvalue and Condition Index
## ------------------------------
## Eigenvalue Condition Index intercept NOx.GT. PT08.S4.NO2.
## 1 2.64611815 1.000000 0.01668626 0.0446849114 0.01658563
## 2 0.28497117 3.047222 0.06870324 0.9551867442 0.06551725
## 3 0.06891068 6.196713 0.91461051 0.0001283444 0.91789712
residuals(mod) %>% hist(main = "Residuals NOx.GT. +PT08.S4.NO2.")
plot(mod, which = 1)
Final model Residuals should be normally distributed and the Q-Q Plot will show this. If residuals follow close to a straight line on this plot, it is a good indication they are normally distributed.
# Good model but NOx.GT. not normally distributed
# I've decided not to transform data, because I don't want to see log-log transformation
mod <- lm(C6H6.GT.~NOx.GT. + PT08.S3.NOx. + NO2.GT.+PT08.S4.NO2., data = air_norm)
summary(mod)
##
## Call:
## lm(formula = C6H6.GT. ~ NOx.GT. + PT08.S3.NOx. + NO2.GT. + PT08.S4.NO2.,
## data = air_norm)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.13920 -0.02335 -0.00319 0.01927 0.61839
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.143806 0.003379 -42.557 <2e-16 ***
## NOx.GT. 0.356365 0.005050 70.564 <2e-16 ***
## PT08.S3.NOx. 0.008813 0.006702 1.315 0.189
## NO2.GT. 0.158388 0.004994 31.717 <2e-16 ***
## PT08.S4.NO2. 0.472327 0.003329 141.894 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.03634 on 6936 degrees of freedom
## Multiple R-squared: 0.9045, Adjusted R-squared: 0.9045
## F-statistic: 1.643e+04 on 4 and 6936 DF, p-value: < 2.2e-16
X <- model.matrix(~NOx.GT. + PT08.S3.NOx. + NO2.GT.+PT08.S4.NO2., data = air_norm)
XX <- t(X) %*% X
eigen <- eigen(XX)
CI <- sqrt(max(eigen$values) / min(eigen$values))
CI
## [1] 21.5376
# Also can count it via formula 1/1-R^2
ols_coll_diag(mod)
## Tolerance and Variance Inflation Factor
## ---------------------------------------
## Variables Tolerance VIF
## 1 NOx.GT. 0.3739089 2.674448
## 2 PT08.S3.NOx. 0.3721233 2.687282
## 3 NO2.GT. 0.3708137 2.696772
## 4 PT08.S4.NO2. 0.6803347 1.469865
##
##
## Eigenvalue and Condition Index
## ------------------------------
## Eigenvalue Condition Index intercept NOx.GT. PT08.S3.NOx. NO2.GT.
## 1 4.21432517 1.000000 9.087272e-04 0.006362944 0.002997369 0.002945636
## 2 0.56258305 2.736974 1.141258e-03 0.113454921 0.066527093 0.005957732
## 3 0.15130645 5.277586 8.538575e-05 0.123219339 0.081423501 0.012714553
## 4 0.05972556 8.400089 2.918017e-03 0.755643783 0.082724495 0.568361415
## 5 0.01205978 18.693662 9.949466e-01 0.001319013 0.766327542 0.410020664
## PT08.S4.NO2.
## 1 4.556101e-03
## 2 1.072926e-05
## 3 4.090600e-01
## 4 4.973504e-02
## 5 5.366381e-01
ols_plot_resid_fit_spread(mod)
ols_plot_obs_fit(mod)
mod <- lm(C6H6.GT.~CO.GT. + PT08.S4.NO2., data = air_norm)
summary(mod)
##
## Call:
## lm(formula = C6H6.GT. ~ CO.GT. + PT08.S4.NO2., data = air_norm)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.16444 -0.01879 -0.00216 0.01556 0.76399
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.051025 0.001126 -45.33 <2e-16 ***
## CO.GT. 0.718581 0.004320 166.32 <2e-16 ***
## PT08.S4.NO2. 0.215265 0.003322 64.81 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.03411 on 6938 degrees of freedom
## Multiple R-squared: 0.9159, Adjusted R-squared: 0.9158
## F-statistic: 3.776e+04 on 2 and 6938 DF, p-value: < 2.2e-16
X <- model.matrix(~CO.GT. + PT08.S4.NO2., data = air_norm)
XX <- t(X) %*% X
eigen <- eigen(XX)
CI <- sqrt(max(eigen$values) / min(eigen$values))
CI
## [1] 13.33846
ols_coll_diag(mod)
## Tolerance and Variance Inflation Factor
## ---------------------------------------
## Variables Tolerance VIF
## 1 CO.GT. 0.6020484 1.660996
## 2 PT08.S4.NO2. 0.6020484 1.660996
##
##
## Eigenvalue and Condition Index
## ------------------------------
## Eigenvalue Condition Index intercept CO.GT. PT08.S4.NO2.
## 1 2.76742714 1.000000 0.01578747 0.02263258 0.010088266
## 2 0.18068900 3.913562 0.29140318 0.63351920 0.006558737
## 3 0.05188386 7.303347 0.69280934 0.34384822 0.983352997
ols_plot_resid_fit_spread(mod)
ols_plot_obs_fit(mod)
Separated data in 75:25% We can see that R^2 is close to 1 which says about linear dependence. P-value is close to 0 - independent variables explain the dynamics of the dependent variable.
set.seed(2)
sep <- sample.int(n = nrow(air_norm), size = floor(.75*nrow(air_norm)))
train <- air_norm[sep,]
test <- air_norm[-sep,]
train1 <- train[,c('C6H6.GT.', 'CO.GT.', 'PT08.S4.NO2.')]
test1 <- test[,c('C6H6.GT.', 'CO.GT.', 'PT08.S4.NO2.')]
mod1 <- lm(C6H6.GT.~CO.GT. + PT08.S4.NO2., data = train1)
a1 <- summary(mod1)
pred1 <- predict(mod1, newdata = test1)
test1$C6H6.GT._pred <- pred1
head(test1)
test1 <- rbindlist(list(test1[,c(2,3,1)], test1[,c(2,3,4)]))
train1$type <- 'train'
test1$type <- 'test'
test1[1:(nrow(test1)/2),4] <- 'real value'
all <- rbind(train1, test1)
plot_ly(data = all,
z = ~C6H6.GT.,
y = ~PT08.S4.NO2.,
x = ~CO.GT., opacity = 0.7, color = ~type)%>%
layout(title = paste("R2", round(a1$r.squared, 3),
sep = ": "),
paste("pvalue", 2.2e-16, sep = ": "),
xaxis = list(title = "CO.GT.",
zeroline = FALSE),
yaxis = list(title = "PT08.S4.NO2.",
zeroline = FALSE),
zaxis = list(title = "C6H6.GT.",
zeroline = FALSE))